{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/html": [
       "\n",
       "<style type='text/css'>\n",
       ".hll { background-color: #ffffcc }\n",
       ".c { color: #408080; font-style: italic } /* Comment */\n",
       ".err { border: 1px solid #FF0000 } /* Error */\n",
       ".k { color: #008000; font-weight: bold } /* Keyword */\n",
       ".o { color: #666666 } /* Operator */\n",
       ".ch { color: #408080; font-style: italic } /* Comment.Hashbang */\n",
       ".cm { color: #408080; font-style: italic } /* Comment.Multiline */\n",
       ".cp { color: #BC7A00 } /* Comment.Preproc */\n",
       ".cpf { color: #408080; font-style: italic } /* Comment.PreprocFile */\n",
       ".c1 { color: #408080; font-style: italic } /* Comment.Single */\n",
       ".cs { color: #408080; font-style: italic } /* Comment.Special */\n",
       ".gd { color: #A00000 } /* Generic.Deleted */\n",
       ".ge { font-style: italic } /* Generic.Emph */\n",
       ".gr { color: #FF0000 } /* Generic.Error */\n",
       ".gh { color: #000080; font-weight: bold } /* Generic.Heading */\n",
       ".gi { color: #00A000 } /* Generic.Inserted */\n",
       ".go { color: #888888 } /* Generic.Output */\n",
       ".gp { color: #000080; font-weight: bold } /* Generic.Prompt */\n",
       ".gs { font-weight: bold } /* Generic.Strong */\n",
       ".gu { color: #800080; font-weight: bold } /* Generic.Subheading */\n",
       ".gt { color: #0044DD } /* Generic.Traceback */\n",
       ".kc { color: #008000; font-weight: bold } /* Keyword.Constant */\n",
       ".kd { color: #008000; font-weight: bold } /* Keyword.Declaration */\n",
       ".kn { color: #008000; font-weight: bold } /* Keyword.Namespace */\n",
       ".kp { color: #008000 } /* Keyword.Pseudo */\n",
       ".kr { color: #008000; font-weight: bold } /* Keyword.Reserved */\n",
       ".kt { color: #B00040 } /* Keyword.Type */\n",
       ".m { color: #666666 } /* Literal.Number */\n",
       ".s { color: #BA2121 } /* Literal.String */\n",
       ".na { color: #7D9029 } /* Name.Attribute */\n",
       ".nb { color: #008000 } /* Name.Builtin */\n",
       ".nc { color: #0000FF; font-weight: bold } /* Name.Class */\n",
       ".no { color: #880000 } /* Name.Constant */\n",
       ".nd { color: #AA22FF } /* Name.Decorator */\n",
       ".ni { color: #999999; font-weight: bold } /* Name.Entity */\n",
       ".ne { color: #D2413A; font-weight: bold } /* Name.Exception */\n",
       ".nf { color: #0000FF } /* Name.Function */\n",
       ".nl { color: #A0A000 } /* Name.Label */\n",
       ".nn { color: #0000FF; font-weight: bold } /* Name.Namespace */\n",
       ".nt { color: #008000; font-weight: bold } /* Name.Tag */\n",
       ".nv { color: #19177C } /* Name.Variable */\n",
       ".ow { color: #AA22FF; font-weight: bold } /* Operator.Word */\n",
       ".w { color: #bbbbbb } /* Text.Whitespace */\n",
       ".mb { color: #666666 } /* Literal.Number.Bin */\n",
       ".mf { color: #666666 } /* Literal.Number.Float */\n",
       ".mh { color: #666666 } /* Literal.Number.Hex */\n",
       ".mi { color: #666666 } /* Literal.Number.Integer */\n",
       ".mo { color: #666666 } /* Literal.Number.Oct */\n",
       ".sb { color: #BA2121 } /* Literal.String.Backtick */\n",
       ".sc { color: #BA2121 } /* Literal.String.Char */\n",
       ".sd { color: #BA2121; font-style: italic } /* Literal.String.Doc */\n",
       ".s2 { color: #BA2121 } /* Literal.String.Double */\n",
       ".se { color: #BB6622; font-weight: bold } /* Literal.String.Escape */\n",
       ".sh { color: #BA2121 } /* Literal.String.Heredoc */\n",
       ".si { color: #BB6688; font-weight: bold } /* Literal.String.Interpol */\n",
       ".sx { color: #008000 } /* Literal.String.Other */\n",
       ".sr { color: #BB6688 } /* Literal.String.Regex */\n",
       ".s1 { color: #BA2121 } /* Literal.String.Single */\n",
       ".ss { color: #19177C } /* Literal.String.Symbol */\n",
       ".bp { color: #008000 } /* Name.Builtin.Pseudo */\n",
       ".vc { color: #19177C } /* Name.Variable.Class */\n",
       ".vg { color: #19177C } /* Name.Variable.Global */\n",
       ".vi { color: #19177C } /* Name.Variable.Instance */\n",
       ".il { color: #666666 } /* Literal.Number.Integer.Long */\n",
       "</style>\n"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "importing Jupyter notebook from review_data_getData.ipynb\n",
      "importing Jupyter notebook from restaurants_data_cleaning.ipynb\n",
      "importing Jupyter notebook from review_data_classify.ipynb\n"
     ]
    }
   ],
   "source": [
    "import pandas\n",
    "import os.path\n",
    "from math import log\n",
    "from numpy import mean, timedelta64\n",
    "\n",
    "import jupynbimp\n",
    "import review_data_getData \n",
    "from review_data_classify import ReviewSentiment"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# names of the review data columns used throughout this notebook\n",
    "__COL_NAME_GROUP_ID = 'business_id'\n",
    "__COL_NAME_DOCUMENT_TEXT = 'text'\n",
    "__COL_NAME_RATING_DATE = 'date'\n",
    "__COL_NAME_RATING_VALUE = 'stars'\n",
    "\n",
    "# [lowest, highest] valid star rating, both bounds included\n",
    "__RATING_VALUE_RANGE = [1, 5]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def __calculateWeightedRating(ratingValue, ratingCount):\n",
    "    \"\"\"Weight a star rating by the log of the number of ratings behind it.\n",
    "\n",
    "    The rating is first centered on the midpoint of __RATING_VALUE_RANGE, so a\n",
    "    'neutral' rating maps to 0, and is then multiplied by log(ratingCount).\n",
    "    Note that a single rating always yields 0, because log(1) == 0.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    ratingValue : number\n",
    "        Star rating; must lie within __RATING_VALUE_RANGE (bounds included).\n",
    "    ratingCount : number or numeric string\n",
    "        Number of ratings; converted with int() before use.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    float\n",
    "        The weighted rating, or nan when ratingCount cannot be converted to a\n",
    "        positive integer or ratingValue lies outside the valid range.\n",
    "    \"\"\"\n",
    "\n",
    "    # this notebook never imports a `nan` name, so build the value locally\n",
    "    notANumber = float('nan')\n",
    "\n",
    "    try:\n",
    "        ratingCount = int(ratingCount)\n",
    "\n",
    "    # None -> TypeError, 'abc' or float nan -> ValueError, float inf -> OverflowError\n",
    "    except (TypeError, ValueError, OverflowError):\n",
    "        return notANumber\n",
    "\n",
    "    # log() is undefined for counts <= 0 and would raise ValueError\n",
    "    if ratingCount < 1:\n",
    "        return notANumber\n",
    "\n",
    "    lowest, highest = min(__RATING_VALUE_RANGE), max(__RATING_VALUE_RANGE)\n",
    "\n",
    "    # also rejects a nan ratingValue, since every comparison with nan is False\n",
    "    if not (lowest <= ratingValue <= highest):\n",
    "        return notANumber\n",
    "\n",
    "    # 'neutral' rating normalized to 0\n",
    "    normalizedRating = ratingValue - mean(__RATING_VALUE_RANGE)\n",
    "\n",
    "    return normalizedRating * log(ratingCount)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def __calculateSentiment(data):\n",
    "    \"\"\"Calculate the 'sentiment' of reviews (star rating predicted from review text).\n",
    "\n",
    "    The tuned model is fit to the entire data and then used to predict the\n",
    "    star rating; see the review_data_classify module for model fitting and\n",
    "    tuning.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : review data holding the __COL_NAME_RATING_VALUE (label) and\n",
    "        __COL_NAME_DOCUMENT_TEXT (text) columns.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    Whatever ReviewSentiment.predictSentiment returns -- presumably the input\n",
    "    reviews with an added 'sentiment' column; confirm in review_data_classify.\n",
    "    \"\"\"\n",
    "\n",
    "    classifier = ReviewSentiment(\n",
    "        data=data,\n",
    "        label=__COL_NAME_RATING_VALUE,\n",
    "        text=__COL_NAME_DOCUMENT_TEXT,\n",
    "    )\n",
    "\n",
    "    # predict with the parameter set stored on the class\n",
    "    return classifier.predictSentiment(ReviewSentiment._optimalParameters)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 37,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def __aggregateReviews(data):\n",
    "    \"\"\"Collapse individual reviews into one row per business.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : pandas.DataFrame\n",
    "        One row per review; must hold the __COL_NAME_GROUP_ID,\n",
    "        __COL_NAME_RATING_VALUE and __COL_NAME_RATING_DATE columns plus a\n",
    "        'sentiment' column. The date column has to support subtraction\n",
    "        (assumed to be datetime64 -- confirm against the loader).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        Indexed by business id, with the columns\n",
    "        review_count : number of reviews in the group\n",
    "        stars        : mean star rating\n",
    "        review_span  : whole days between the earliest and latest review (int)\n",
    "        sentiment    : mean sentiment\n",
    "        rating       : mean star rating weighted by __calculateWeightedRating\n",
    "    \"\"\"\n",
    "\n",
    "    data = (data\n",
    "            .groupby(__COL_NAME_GROUP_ID)\n",
    "            .aggregate({__COL_NAME_GROUP_ID: 'count',\n",
    "                        __COL_NAME_RATING_VALUE: 'mean',\n",
    "                        __COL_NAME_RATING_DATE: lambda dates: dates.max() - dates.min(),\n",
    "                        'sentiment': 'mean',\n",
    "                       })\n",
    "            .rename(columns={__COL_NAME_GROUP_ID: 'review_count',\n",
    "                             __COL_NAME_RATING_DATE: 'review_span'\n",
    "                            }\n",
    "                   )\n",
    "           )\n",
    "\n",
    "    # store review_span as number of whole days (int)\n",
    "    # .dt.days is used instead of the unit cast astype('timedelta64[D]'),\n",
    "    # which returned floats in older pandas and is rejected by pandas 2\n",
    "    data['review_span'] = (pandas.to_timedelta(data['review_span'])\n",
    "                           .dt.days\n",
    "                           .astype(int)\n",
    "                          )\n",
    "\n",
    "    # normalize rating by review_count\n",
    "    data['rating'] = data.apply(lambda row:\n",
    "                                __calculateWeightedRating(row[__COL_NAME_RATING_VALUE],\n",
    "                                                          row['review_count']\n",
    "                                                         ), axis=1\n",
    "                               )\n",
    "\n",
    "    return data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 35,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "def getData(fromCache=True):\n",
    "    \"\"\"Return the per-business aggregated review data, backed by a csv cache.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    fromCache : bool, default True\n",
    "        If truthy and the cache file exists, load and return it. Otherwise\n",
    "        rebuild the data from the raw reviews and overwrite the cache file.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "        One row per business, as produced by __aggregateReviews, with the\n",
    "        business id moved from the index into a regular column.\n",
    "    \"\"\"\n",
    "\n",
    "    # NOTE: the directory is relative to the current working directory\n",
    "    dataDirectory = '../../data/yelp/'\n",
    "    outputName = 'reviews_aggregated.csv'\n",
    "    cachePath = os.path.join(dataDirectory, outputName)\n",
    "\n",
    "    # logical `and` instead of bitwise `&`: any truthy flag works (2 & True == 0)\n",
    "    # and the file system is not touched when caching is switched off\n",
    "    if fromCache and os.path.isfile(cachePath):\n",
    "        return pandas.read_csv(cachePath, header=0)\n",
    "\n",
    "    # raw data from study area\n",
    "    reviews = review_data_getData.getReviews()\n",
    "\n",
    "    # sentiment must be calculated before aggregation\n",
    "    # (labels for classifier training come from individual reviews)\n",
    "    data = __aggregateReviews(data=__calculateSentiment(data=reviews))\n",
    "\n",
    "    # turn the business id index back into a regular column\n",
    "    data = data.reset_index()\n",
    "\n",
    "    # cache results and return; create the directory first so the freshly\n",
    "    # computed result is not lost to a missing folder\n",
    "    os.makedirs(dataDirectory, exist_ok=True)\n",
    "    data.to_csv(cachePath, index=False)\n",
    "\n",
    "    return data"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}
